package io.jpress.commons.word;

import fr.opensagres.poi.xwpf.converter.xhtml.Base64EmbedImgManager;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLConverter;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;

/**
 * Helper for turning Word (.docx) files into XHTML markup.
 */
public class XWPFUtil {

    /**
     * Converts the given .docx file to XHTML and post-processes the result.
     *
     * <p>The document is loaded with POI's {@link XWPFDocument}, rendered by
     * {@link XHTMLConverter} with images handled by a {@link Base64EmbedImgManager},
     * and the resulting markup is handed to
     * {@code DocumentUtil.process(html, {"body", "p", "span"})}.
     *
     * @param file the .docx file to read
     * @return whatever {@code DocumentUtil.process} returns for the converted markup
     * @throws Exception if the file cannot be opened or read, or if the conversion
     *                   or post-processing fails
     */
    public static String parse(File file) throws Exception {

        ByteArrayOutputStream out = new ByteArrayOutputStream();

        // Both the file stream and the document hold resources; try-with-resources
        // releases them on success and on failure alike (the original never closed either).
        try (FileInputStream in = new FileInputStream(file);
             XWPFDocument wordDocument = new XWPFDocument(in)) {

            // Images are managed by Base64EmbedImgManager rather than the converter's default.
            XHTMLOptions options = XHTMLOptions.create()
                    .setImageManager(new Base64EmbedImgManager());

            XHTMLConverter.getInstance().convert(wordDocument, out, options);
        }

        // NOTE(review): toString() decodes with the platform default charset; this is
        // only correct if the converter encodes its output the same way - confirm.
        return DocumentUtil.process(out.toString(), new String[]{"body", "p", "span"});
    }

}
